suppressPackageStartupMessages({
library(ArchR)
library(chromVAR)
library(tidyverse)
library(SingleCellExperiment)
library(zellkonverter)
library(dtwclust)
library(BSgenome.Mmusculus.UCSC.mm10)
library(motifmatchr)
library(chromVARmotifs)
})
dir_path <- "/omics/groups/OE0533/internal/katharina/scDoRI/gastrulation_data/jupyter_notebooks/SEACell_files/SEA_aggregates_to_R/"
proj <- loadArchRProject("11_added_Ricards_peaks_p2g_entire_chromosome/")
## Successfully loaded ArchRProject!
##
## / |
## / \
## . / |.
## \\\ / |.
## \\\ / `|.
## \\\ / |.
## \ / |\
## \\#####\ / ||
## ==###########> / ||
## \\##==......\ / ||
## ______ = =|__ /__ || \\\
## ,--' ,----`-,__ ___/' --,-`-===================##========>
## \ ' ##_______ _____ ,--,__,=##,__ ///
## , __== ___,-,__,--'#' ===' `-' | ##,-/
## -,____,---' \\####\\________________,--\\_##,/
## ___ .______ ______ __ __ .______
## / \ | _ \ / || | | | | _ \
## / ^ \ | |_) | | ,----'| |__| | | |_) |
## / /_\ \ | / | | | __ | | /
## / _____ \ | |\ \\___ | `----.| | | | | |\ \\___.
## /__/ \__\ | _| `._____| \______||__| |__| | _| `._____|
##
peaks <- getPeakSet(proj)
class(peaks)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
Color Palette:
colPalette_celltypes = c('#532C8A',
'#c19f70',
'#f9decf',
'#c9a997',
'#B51D8D',
'#3F84AA',
'#9e6762',
'#354E23',
'#F397C0',
'#ff891c',
'#635547',
'#C72228',
'#f79083',
'#EF4E22',
'#989898',
'#7F6874',
'#8870ad',
'#647a4f',
'#EF5A9D',
'#FBBE92',
'#139992',
'#cc7818',
'#DFCDE4',
'#8EC792',
'#C594BF',
'#C3C388',
'#0F4A9C',
'#FACB12',
'#8DB5CE',
'#1A1A1A',
'#C9EBFB',
'#DABE99',
'#65A83E',
'#005579',
'#CDE088',
'#f7f79e',
'#F6BFCB')
celltypes <- (as.data.frame(getCellColData(proj)) %>% group_by(celltypes) %>%
summarise(n = n()))$celltypes
col <- setNames(colPalette_celltypes, celltypes)
Based on the SEACell metacells, we created an aggregated peak matrix which we read into R and create a SummarizedExperiment for downstream analysis.
atac_agg_rowData <- read_csv(paste0(dir_path, "atac_agg_rowData.csv"))
atac_agg_rowData <- atac_agg_rowData %>% column_to_rownames("...1")
atac_agg_colData <- read_csv(paste0(dir_path, "atac_agg_colData.csv"))
atac_agg_colData <- atac_agg_colData %>% column_to_rownames("...1")
peak_agg_matrix <- as.matrix(read_csv(paste0(dir_path, "peak_agg_matrix.csv")))
peak_agg_matrix <- t(peak_agg_matrix)
dim(peak_agg_matrix)
atac_peak_names <- read_csv(paste0(dir_path, "atac_peak_names.csv"))
atac_cell_names <- read_csv(paste0(dir_path, "atac_cell_names.csv"))
rownames(peak_agg_matrix) <- atac_peak_names$`0`
colnames(peak_agg_matrix) <- atac_cell_names$`0`
peak_agg_matrix[1:5, 1:5]
Below you can see the GRanges object for our peaks:
peaks %>% head
## GRanges object with 6 ranges and 4 metadata columns:
## seqnames ranges strand | score idx
## <Rle> <IRanges> <Rle> | <numeric> <integer>
## chr1:3035600-3036200 chr1 3035600-3036200 * | 11.52670 1
## chr1:3062691-3063291 chr1 3062691-3063291 * | 12.59330 2
## chr1:3072272-3072872 chr1 3072272-3072872 * | 9.29753 3
## chr1:3191513-3192113 chr1 3191513-3192113 * | 7.86981 4
## chr1:3466250-3466850 chr1 3466250-3466850 * | 12.11960 5
## chr1:3482737-3483337 chr1 3482737-3483337 * | 32.71030 6
## GC N
## <numeric> <numeric>
## chr1:3035600-3036200 0.4160 0
## chr1:3062691-3063291 0.4493 0
## chr1:3072272-3072872 0.3677 0
## chr1:3191513-3192113 0.4060 0
## chr1:3466250-3466850 0.4126 0
## chr1:3482737-3483337 0.4160 0
## -------
## seqinfo: 20 sequences from an unspecified genome; no seqlengths
Create a Summarized Experiments from the SEACell aggregate peak matrix
peak_sea <- SummarizedExperiment(assays = list(counts = peak_agg_matrix),
rowRanges = peaks,
colData = atac_agg_colData)
colnames(colData(peak_sea)) <- c("depth")
#saveRDS(peak_sea, paste0(dir_path, "sea_peak_sce.Rds"))
peak_sea <- readRDS(paste0(dir_path, "sea_peak_sce.Rds"))
peak_sea
## class: RangedSummarizedExperiment
## dim: 180499 613
## metadata(0):
## assays(1): counts
## rownames(180499): chr1:3035600-3036200 chr1:3062691-3063291 ...
## chrX:169915632-169916232 chrX:169925539-169926139
## rowData names(4): score idx GC N
## colnames(613): E8.0_rep2#CGCATTTGTTGCATCT-1
## E8.75_rep1#AGTGCCGGTTAGGTTG-1 ... E8.75_rep1#AATCGCCCATACTCCT-1
## E8.0_rep2#AGCAGGTAGCTTCCCG-1
## colData names(1): depth
peak_sea <- addGCBias(peak_sea,
genome = BSgenome.Mmusculus.UCSC.mm10)
Here we use the motif annotations from ArchR, in order to be able to better compare the annotations with the ArchR deviation results on single cells.
proj <- addMotifAnnotations(ArchRProj = proj, motifSet = "cisbp", name = "Motif")
motifs <- getPeakAnnotation(proj, "Motif")
motif_ix <- matchMotifs(motifs$motifs, # motif pwm matrix
peak_sea, # peak accessibility matrix
genome = BSgenome.Mmusculus.UCSC.mm10) # genome
dev <- computeDeviations(object = peak_sea, annotations = motif_ix)
dev
# remove index number from TFs
rownames(dev) <- str_remove(rownames(dev), "_(?=[0-9])")
variability <- computeVariability(dev)
#saveRDS(dev, paste0(dir_path, "SEACell_ChromVarDev"))
#saveRDS(variability, paste0(dir_path, "SEACell_ChromVarDev_variability"))
write.csv(deviations(dev), paste0(dir_path, "deviations.csv"))
write.csv(deviationScores(dev), paste0(dir_path, "deviationScores.csv"))
write.csv(variability, paste0(dir_path, "variability.csv"))
# read in the deviations
dev <- readRDS(paste0(dir_path, "SEACell_ChromVarDev"))
variability <- readRDS(paste0(dir_path, "SEACell_ChromVarDev_variability"))
sea_archr_meta <- read_csv("/omics/groups/OE0533/internal/katharina/scDoRI/gastrulation_data/jupyter_notebooks/SEACell_files/archr_sea_metadata.csv")
## Rows: 45986 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): index, Sample, celltypes, SEACell
## dbl (18): TSSEnrichment, ReadsInTSS, ReadsInPromoter, ReadsInBlacklist, Prom...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
celltype_df <- sea_archr_meta %>%
dplyr::count(SEACell, celltypes) %>%
dplyr::group_by(SEACell) %>%
slice_max(order_by=n, n=1, with_ties = FALSE) %>%
select(SEACell, celltypes)
# sea_archr_meta %>%
# dplyr::count(SEACell, celltypes) %>%
# dplyr::group_by(SEACell) %>%
# slice_max(order_by=n, n=1) %>%
# pull(celltypes, SEACell) %>% head
p1 <- celltype_df %>%
ggplot() + geom_bar(aes(x = celltypes, fill = celltypes)) +
theme(legend.position = "None",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
scale_fill_manual(values = col) +
labs(title = "Number of metacells with particular celltype")
p2 <- sea_archr_meta %>%
ggplot() +
geom_bar(aes(x = celltypes, fill = celltypes)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position = "None") +
scale_fill_manual(values = col)
labs(title = "Number of single cells with particular celltype")
## $title
## [1] "Number of single cells with particular celltype"
##
## attr(,"class")
## [1] "labels"
cowplot::plot_grid(p1, p2, ncol = 1)
x <- (sea_archr_meta %>% group_by(SEACell) %>%
summarise(cell_n = n()) %>%
ungroup())$cell_n
y <- (sea_archr_meta %>%
dplyr::count(SEACell, celltypes) %>%
dplyr::group_by(SEACell) %>%
slice_max(order_by=n, n=1, with_ties = FALSE))$n
ggplot() +
geom_point(aes (x = x,
y = y)) +
geom_line(aes(x = x, y = x, color = "orange")) +
geom_line(aes(x = x, y = x * 0.8), color = "blue") +
theme(legend.position = "None") +
labs(x = "Number of single cells per SEACell",
y = "Number of single cells with most common celltype per SEACell")
x <- (sea_archr_meta %>% filter(SEACell %in%
(celltype_df %>% filter(celltypes == "Cardiomyocytes"))$SEACell) %>%
group_by(SEACell) %>%
summarise(cell_n = n() ) %>% ungroup())$cell_n
y = (sea_archr_meta %>%
filter(SEACell %in% (celltype_df %>% filter(celltypes == "Cardiomyocytes"))$SEACell) %>%
dplyr::count(SEACell, celltypes) %>%
dplyr::group_by(SEACell) %>%
slice_max(order_by=n, n=1, with_ties = FALSE))$n
ggplot() + geom_point(aes(x = x, y = y)) +
labs(title = "Cardiomyocyte SEACells",
x = "Number of single cells in metacells",
y = "Number of cells with Cardiomyocyte label")
df <- colData(dev) %>% as.data.frame() %>% rownames_to_column("SEACell")
df <- left_join(celltype_df, df, by = "SEACell")
colData(dev) <- DataFrame(df)
motif_mtx <- assays(dev)[[1]]
tfs <- rownames(dev)
metadata <- colData(dev) %>% as.data.frame()
colnames(motif_mtx) <- metadata$SEACell
gatas <- c("Gata1", "Gata2", "Gata3", "Gata4", "Gata5", "Gata6")
plots <- map (gatas, function(n){
motif_n <- motif_mtx[rownames(motif_mtx) == tfs[grepl(paste0("^", n), tfs)], ]
p2 <- metadata %>%
mutate(!!n := motif_n) %>%
group_by(celltypes) %>%
#summarise_at(vars(n), funs(mean(., na.rm=TRUE)))
summarise(mean = mean(!!(sym(n)))) %>%
ggplot() +
geom_bar(aes(x = celltypes, y = mean, fill = celltypes), stat = "identity") +
scale_fill_manual(values = col) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position = "None") +#%>% print()
labs(y = "SEACell deviation score")
p1 <- metadata %>%
mutate(!!n := motif_n) %>%
#group_by(SEACell) %>% #, celltypes) %>%
#summarise_at(vars(n), funs(mean(., na.rm=TRUE)))
#summarise(mean = mean(!!(sym(n)))) %>%
ggplot() +
geom_boxplot(aes(x = celltypes, y = !!(sym(n)),
fill = celltypes)) +
geom_jitter(aes(x = celltypes,
y = !!(sym(n))), color="black",
size=0.4, alpha=0.9) +
# geom_point(aes(x = celltypes, y = !!(sym(n))),
# position = position_dodge(width = .5))) +
scale_fill_manual(values = col) +
theme(legend.position = "None",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
labs(title = paste0(n),y = "SEACell deviation score")
plot <- cowplot::plot_grid(p1, p2, ncol = 1)
})
do.call(what = gridExtra::grid.arrange, args = append(plots, list(ncol = 2)))
n = "Gata1"
motif_n <- motif_mtx[rownames(motif_mtx) == tfs[grepl(paste0("^", n), tfs)], ]
metadata %>%
mutate(!!n := motif_n) %>%
ggplot() +
geom_bar(aes(x = SEACell, y = !!(sym(n))), stat = "identity", width = 5) +
geom_hline(yintercept = 0) +
#scale_fill_manual(values = col) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position = "None") +#%>% print()
labs(title = paste0(n), y = "SEACell deviation score") +
facet_wrap(~ celltypes, ncol = 3)
## Warning: position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
metadata %>%
mutate(!!n := motif_n) %>%
filter(celltypes == "Cardiomyocytes")
## SEACell celltypes depth Gata1
## 1 E8.5_rep1#GTAGTTTCAGCACGAA-1 Cardiomyocytes 9064.933 0.012132169
## 2 E8.5_rep2#CAAGTATGTAATGACT-1 Cardiomyocytes 10973.384 -0.008163193
## 3 E8.5_rep2#TTGTTCCCATGGTTAT-1 Cardiomyocytes 15322.527 -0.010098280
## 4 E8.75_rep1#AGGCCCAGTACCAGGT-1 Cardiomyocytes 5357.824 -0.009610496
## 5 E8.75_rep1#GCGCGATTCTTGTTCG-1 Cardiomyocytes 15861.952 0.007847011
To check why the deviation scores are worse than expected, I removed any SEACells which are less than 80% pure, meaning that less than 80% of cells belong to the same celltype.
pure_seacells <- (sea_archr_meta %>%
dplyr::group_by(SEACell) %>% #head
mutate(cells_per_SEA = n()) %>%
ungroup() %>%
dplyr::count(SEACell, celltypes, cells_per_SEA) %>%
dplyr::group_by(SEACell) %>%
mutate(percent = n/cells_per_SEA) %>%
filter(percent > .8))$SEACell
print(paste0("Out of ", nrow(metadata), " SEACells only ", length(pure_seacells),
" are above 80% composed of the same celltype."))
## [1] "Out of 613 SEACells only 355 are above 80% composed of the same celltype."
plots <- map(gatas, function(n){
motif_n <- motif_mtx[rownames(motif_mtx) == tfs[grepl(paste0("^", n), tfs)], pure_seacells]
p2 <- metadata %>%
filter(SEACell %in% pure_seacells) %>%
mutate(!!n := motif_n) %>%
group_by(celltypes) %>%
summarise(mean = mean(!!(sym(n)))) %>%
ggplot() +
geom_bar(aes(x = celltypes, y = mean, fill = celltypes), stat = "identity") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position = "None")+
scale_fill_manual(values = col) +
labs( y = "SEACell deviation score")
p1 <- metadata %>%
filter(SEACell %in% pure_seacells) %>%
mutate(!!n := motif_n) %>%
ggplot() +
geom_boxplot(aes(x = celltypes, y = !!(sym(n)), fill = celltypes)) +
geom_jitter(aes(x = celltypes,
y = !!(sym(n))), color="black",
size=0.4, alpha=0.9) +
scale_fill_manual(values = col) +
theme(legend.position = "None",
axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
labs(title = paste0(n),y = "SEACell deviation scores")
plots <- cowplot::plot_grid(p1, p2, ncol = 1)
})
do.call(what = gridExtra::grid.arrange, args = append(plots, list(ncol = 2)))
# add raw counts to python
rna_seurat <- readRDS("Seurat_objects/rna_Seurat_object")
raw_counts<- rna_seurat@assays$originalexp@counts[rna_genes$`0`, ]
write.csv(raw_counts, '/omics/groups/OE0533/internal/katharina/scDoRI/gastrulation_data/jupyter_notebooks/SEACell_files/ArchR_object/raw_gene_expr.csv', quote=FALSE)
rna_mat <- read_csv(paste0(dir_path, "rna_agg_matrix.csv"))
rna_cells <- read_csv(paste0(dir_path, "rna_cell_names.csv"))
rna_genes <- read_csv(paste0(dir_path, "rna_gene_names.csv"))
dim(rna_mat)
rna_mat <- t(rna_mat)
rownames(rna_mat) <- rna_genes$`0`
colnames(rna_mat) <- rna_cells$`0`
test <- SummarizedExperiment(assays = list(counts = rna_mat),
#rowRanges = gene_anno,
colData = DataFrame(df))
#
#
# # normalize counts
# norm_rna_mat <- log1p(t(t(rna_mat) / colSums(rna_mat)) * 1e4)
# #rownames(norm_rna_mat) <- rna_genes$"0"
# colnames(norm_rna_mat) <- rna_cells$"0"
gene_anno <- getGenes(proj) %>% as.data.frame() %>%
unite(index, seqnames, start,sep = ":", remove = FALSE) %>%
unite(index, index, end, sep = "-", remove = FALSE) %>%
filter(symbol %in% rna_genes$"0")# %>% #head
#column_to_rownames(index)
#column_to_rownames("index") %>%
# GRanges()
rna_sce <- SummarizedExperiment(assays = list(counts = rna_mat),
rowData = gene_anno,
#rowRanges = gene_anno,
colData = DataFrame(df))
rna_mat <- assays(rna_sce)[[1]]
metadata <- colData(rna_sce) %>% as.data.frame()
gatas <- c("Gata1", "Gata2", "Gata3", "Gata4", "Gata5", "Gata6")
for (n in gatas){
motif_n <- rna_mat[rownames(rna_mat) == n, ]#tfs[grepl(paste0("^", n), tfs)], ]
p <- metadata %>%
mutate(!!n := motif_n) %>%
group_by(celltypes) %>%
#summarise_at(vars(n), funs(mean(., na.rm=TRUE)))
summarise(mean = mean(!!(sym(n)))) %>%
ggplot() +
geom_bar(aes(x = celltypes, y = mean, fill = celltypes), stat = "identity") +
scale_fill_manual(values = col) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), legend.position = "None") +#%>% print()
labs(title = paste0(n), y = "SEACell score")
print(p)
}